{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35227754",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<!-- \n",
       "If you can see this code, this cell's output is not trusted.\n",
       "Please execute this cell and save the notebook, or click File -> Trust Notebook\n",
       "-->\n",
       "<script>\n",
       "var shown = true;\n",
       "\n",
       "function filter_cells_by_tag(tag) {\n",
       "    out = Array();\n",
       "    all_cells = Jupyter.notebook.get_cells()\n",
       "    for (var i=0; i<all_cells.length; i++) {\n",
       "        var curr_cell = all_cells[i];\n",
       "        var tags = curr_cell._metadata.tags;\n",
       "        if (tags != undefined) {\n",
       "            for (var j=0; j<tags.length; j++) {\n",
       "                var curr_tag = tags[j];\n",
       "                if (curr_tag == tag) {\n",
       "                    out.push(curr_cell);\n",
       "                    break;\n",
       "                }\n",
       "            }\n",
       "        }\n",
       "    }\n",
       "    return out;\n",
       "}\n",
       "\n",
       "function set_cell_visibility(tag, show, input_only) {\n",
       "    var cells = Jupyter.notebook.get_cells();\n",
       "    var marked_cells = filter_cells_by_tag(tag);\n",
       "    for (var i=0; i<marked_cells.length; i++) {\n",
       "        var curr_cell = marked_cells[i];\n",
       "        if (input_only) {\n",
       "            obj = curr_cell.input\n",
       "        } else {\n",
       "            obj = curr_cell.element\n",
       "        }\n",
       "        if (show) {\n",
       "            obj.show();\n",
       "        } else {\n",
       "            obj.hide();\n",
       "        }\n",
       "    }\n",
       "}\n",
       "\n",
       "function toggle_cell_visibility(tag) {\n",
       "    set_cell_visibility(tag, shown, false)\n",
       "    shown = ! shown;\n",
       "}\n",
       "\n",
       "set_cell_visibility('execution_cell', false, true);\n",
       "</script>\n",
       "To toggle visibility of explanation cells click <a href=\"javascript:toggle_cell_visibility('explanatory_cell')\">here</a>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%html\n",
    "<!-- \n",
    "If you can see this code, this cell's output is not trusted.\n",
    "Please execute this cell and save the notebook, or click File -> Trust Notebook\n",
    "-->\n",
    "<script>\n",
    "var shown = true;\n",
    "\n",
    "/**\n",
    " * Collect every notebook cell whose metadata carries the given tag.\n",
    " *\n",
    " * @param {string} tag - Tag name to look for in each cell's `tags` metadata.\n",
    " * @returns {Array} Cells, in notebook order, whose tag list contains `tag`.\n",
    " */\n",
    "function filter_cells_by_tag(tag) {\n",
    "    // Declare locals with `var`: the original assigned to undeclared names,\n",
    "    // which leaked `out` and `all_cells` into the global scope.\n",
    "    var out = [];\n",
    "    var all_cells = Jupyter.notebook.get_cells();\n",
    "    for (var i = 0; i < all_cells.length; i++) {\n",
    "        var curr_cell = all_cells[i];\n",
    "        var tags = curr_cell._metadata.tags;\n",
    "        // Cells without a tag list are skipped rather than treated as an error.\n",
    "        if (Array.isArray(tags) && tags.indexOf(tag) !== -1) {\n",
    "            out.push(curr_cell);\n",
    "        }\n",
    "    }\n",
    "    return out;\n",
    "}\n",
    "\n",
    "/**\n",
    " * Show or hide every cell carrying the given tag.\n",
    " *\n",
    " * @param {string} tag - Tag identifying the cells to update.\n",
    " * @param {boolean} show - Truthy to show the cells, falsy to hide them.\n",
    " * @param {boolean} input_only - Truthy to affect only the cell's input area\n",
    " *     (`cell.input`); falsy to affect the whole cell (`cell.element`).\n",
    " */\n",
    "function set_cell_visibility(tag, show, input_only) {\n",
    "    var marked_cells = filter_cells_by_tag(tag);\n",
    "    for (var i = 0; i < marked_cells.length; i++) {\n",
    "        var curr_cell = marked_cells[i];\n",
    "        // `var` keeps the target local; the original leaked it as a global `obj`.\n",
    "        var target = input_only ? curr_cell.input : curr_cell.element;\n",
    "        if (show) {\n",
    "            target.show();\n",
    "        } else {\n",
    "            target.hide();\n",
    "        }\n",
    "    }\n",
    "}\n",
    "\n",
    "/**\n",
    " * Flip the visibility of every cell carrying the given tag.\n",
    " *\n",
    " * The global `shown` flag records the current state, so it is inverted\n",
    " * before being applied; applying it first made the first click a no-op.\n",
    " *\n",
    " * @param {string} tag - Tag identifying the cells to toggle.\n",
    " */\n",
    "function toggle_cell_visibility(tag) {\n",
    "    shown = !shown;\n",
    "    set_cell_visibility(tag, shown, false);\n",
    "}\n",
    "\n",
    "set_cell_visibility('execution_cell', false, true);\n",
    "</script>\n",
    "To toggle visibility of explanation cells click <a href=\"javascript:toggle_cell_visibility('explanatory_cell')\">here</a>\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e908195c",
   "metadata": {},
   "source": [
    "# HTML Preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "727614ba",
   "metadata": {},
   "source": [
    "This notebook defines the steps for extracting information from an HTML file. To see how to create a generalized API for all documents in the `pipeline-notebooks` directory, refer to the project documentation.\n",
    "\n",
    "To demonstrate how off-the-shelf Unstructured Bricks extract meaningful data from complex source documents, we will apply a series of Bricks with explanations.\n",
    "\n",
    "#### Table of Contents\n",
    "\n",
    "1. [Take a Look at an HTML File](#explore)\n",
    "1. [Custom Partitioning Bricks](#custom)\n",
    "1. [Cleaning Bricks](#cleaning)\n",
    "1. [Staging Bricks](#staging)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3848e558",
   "metadata": {},
   "source": [
    "## Section 1: Take a Look at an HTML File <a id=\"explore\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71814e12",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "\n",
    "\n",
    "def get_filename(directory, filename):\n",
    "    \"\"\"Return the full path to ``filename`` inside ``directory``.\n",
    "\n",
    "    ``directory`` is looked up first next to the parent of the current\n",
    "    working directory and then inside the current working directory itself.\n",
    "\n",
    "    Args:\n",
    "        directory: Name of the directory expected to hold the file.\n",
    "        filename: Name of the file to locate.\n",
    "\n",
    "    Returns:\n",
    "        The path of the first candidate directory that lists ``filename``,\n",
    "        joined with ``filename``.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: If neither candidate directory contains the file.\n",
    "    \"\"\"\n",
    "    cwd = os.getcwd()\n",
    "    candidate_directories = [\n",
    "        os.path.join(os.path.split(cwd)[0], directory),  # sibling of the cwd\n",
    "        os.path.join(cwd, directory),  # child of the cwd\n",
    "    ]\n",
    "\n",
    "    for candidate in candidate_directories:\n",
    "        # isdir (rather than exists) keeps listdir from failing on a plain file.\n",
    "        if os.path.isdir(candidate) and filename in os.listdir(candidate):\n",
    "            return os.path.join(candidate, filename)\n",
    "\n",
    "    # Name the file and every location searched so the failure is actionable.\n",
    "    raise FileNotFoundError(\n",
    "        \"{!r} not found in any of: {}\".format(filename, \", \".join(candidate_directories))\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72f0ebc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = get_filename(\"sample-docs\", \"fake-html.html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea3b2b58",
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured.documents.html import HTMLDocument\n",
    "\n",
    "document = HTMLDocument.from_file(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa146f41",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My First Heading\n",
      "\n",
      "My first paragraph.\n"
     ]
    }
   ],
   "source": [
    "print(document)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "15d69b6b",
   "metadata": {},
   "source": [
    "## Section 2: Custom Partitioning Bricks <a id=\"custom\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff34cce7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from unstructured.partition.html import partition_html\n",
    "\n",
    "elements = partition_html(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a46b93f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<unstructured.documents.html.HTMLTitle object>, <unstructured.documents.html.HTMLTitle object>]\n"
     ]
    }
   ],
   "source": [
    "print(elements)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0312c8c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "My First Heading\n",
      "My first paragraph.\n"
     ]
    }
   ],
   "source": [
    "for element in elements:\n",
    "    print(element.text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "10e1d3df",
   "metadata": {},
   "source": [
    "## Section 3: Cleaning Bricks <a id=\"cleaning\"></a>"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "52943c00",
   "metadata": {},
   "source": [
    "In addition to partitioning bricks, the Unstructured library has\n",
    "***cleaning*** bricks for removing unwanted content from text. In this\n",
    "case, we'll solve our punctuation problem by using the\n",
    "`remove_punctuation` brick. Other uses for cleaning bricks include\n",
    "cleaning out boilerplate, sentence fragments, and other segments\n",
    "of text that could impact labeling tasks or the accuracy of\n",
    "machine learning models. As with partitioning bricks, users can\n",
    "include custom cleaning bricks in a pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "268e7dcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'My first paragraph.'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# This element ends with a period, which the next cell strips with a cleaning brick\n",
    "elements[1].text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "485198a5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'My first paragraph'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from unstructured.cleaners.core import remove_punctuation\n",
    "\n",
    "remove_punctuation(elements[1].text)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f7fea99",
   "metadata": {},
   "source": [
    "## Section 4: Staging Bricks<a id=\"staging\"></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f41f82c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'data': {'text': 'My First Heading',\n",
       "   'ref_id': '0540311f6c077fe8f797080918b8d74b'}},\n",
       " {'data': {'text': 'My first paragraph.',\n",
       "   'ref_id': '399af454cb1368b8257ed406b430de84'}}]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from unstructured.staging.label_studio import stage_for_label_studio\n",
    "\n",
    "label_studio_data = stage_for_label_studio(elements)\n",
    "label_studio_data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "python3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
